import requests
from bs4 import BeautifulSoup
import pandas as pd

# Accumulators: one list per output column, filled in lockstep (one entry per quote)
quotes_content, quotes_tags, quotes_authors, quotes_description = [], [], [], []
authors_birthday, authors_country, authors_genre = [], [], []
authors_rating, authors_reviews = [], []

# Login: submit the credentials once; the session object `s` is reused for the
# page requests made further down in this script
login_url = "https://quotes.toscrape.com/login"
login_data = dict(username='123', password='123')
s = requests.Session()
s.post(login_url, data=login_data)

# Find all quotes in 10 pages
_REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server blocks the scrape forever
_page_cache = {}  # URL -> parsed page, so a page shared by several quotes is downloaded once


def _get_soup(page_url):
    """Return the parsed HTML of *page_url*, fetched through the session ``s``.

    Results are cached per URL. Passing ``None`` (the link was not found on the
    quote) returns an empty document, so every later ``find`` yields ``None``
    and the caller falls back to 'NA' instead of crashing.
    """
    if page_url is None:
        return BeautifulSoup('', 'html.parser')
    if page_url not in _page_cache:
        page = s.get(page_url, timeout=_REQUEST_TIMEOUT)
        _page_cache[page_url] = BeautifulSoup(page.text, 'html.parser')
    return _page_cache[page_url]


def _text_or_na(tag):
    """Return the stripped text of *tag*, or 'NA' when the tag is missing."""
    return tag.text.strip() if tag else 'NA'


for i in range(1, 11):
    soup = _get_soup(f"https://quotes.toscrape.com/page/{i}/")

    for quote in soup.find_all('div', class_='quote'):
        # Content (without the surrounding typographic quotation marks)
        content = quote.find('span', class_='text').text.strip("“”")
        quotes_content.append(content)

        # Tags
        tags = [tag.text for tag in quote.find_all('a', class_='tag')]
        quotes_tags.append(tags)

        # Author
        author = quote.find('small', class_='author').text
        quotes_authors.append(author)

        links = quote.find_all('a')

        # Access to author's about page (None when the quote has no "(about)" link)
        about_path = next((a.get('href') for a in links if a.text == '(about)'), None)
        about_url = f"https://quotes.toscrape.com{about_path}" if about_path else None
        about_soup = _get_soup(about_url)

        # Description of the author
        quotes_description.append(_text_or_na(about_soup.find('div', class_='author-description')))

        # Access to goodreads website (None when the quote has no Goodreads link)
        author_url = next(
            (a['href'] for a in links if a.get('href', '').startswith('http://goodreads.com/')),
            None,
        )
        author_soup = _get_soup(author_url)

        # Birthday of the author
        authors_birthday.append(_text_or_na(author_soup.find('div', itemprop='birthDate')))

        # Country of the author: the text node following the "Born" label
        country = 'NA'
        country_tag = author_soup.find('div', class_='dataTitle', string='Born')
        if country_tag is not None:
            born_text = country_tag.next_sibling
            # next_sibling may be None or a tag; only a text node carries the place
            if isinstance(born_text, str):
                born_text = born_text.strip()
                # Drop only the leading preposition, not every "in " inside the place name
                if born_text.startswith('in '):
                    born_text = born_text[len('in '):]
                country = born_text.strip()
        authors_country.append(country)

        # Genre of the author: link texts in the <div> that follows the "Genre" label
        genre = 'NA'
        genre_tag = author_soup.find('div', class_='dataTitle', string='Genre')
        if genre_tag is not None:
            genre_box = genre_tag.find_next_sibling('div')
            if genre_box is not None:
                genre = [a.text for a in genre_box.find_all('a')]
        authors_genre.append(genre)

        # Average rating of the author
        authors_rating.append(_text_or_na(author_soup.find('span', itemprop='ratingValue')))

        # Number of reviews
        authors_reviews.append(_text_or_na(author_soup.find('span', itemprop='reviewCount')))

# Pair each CSV column name with the list that was filled for it, keeping the
# column order used in the output file
column_names = ['Content', 'Tags', 'Author', 'Description', 'Birthday',
                'Country', 'Genre', 'AVG_Rating', 'Reviews']
column_values = [quotes_content, quotes_tags, quotes_authors, quotes_description,
                 authors_birthday, authors_country, authors_genre,
                 authors_rating, authors_reviews]
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Persist the table (without the row index); later cells reload it from this file
df.to_csv('HuangTianChi Zhu_1931391.csv', index=False)
df


import matplotlib.pyplot as plt

import ast

# Filter the top 15
df = pd.read_csv('HuangTianChi Zhu_1931391.csv')
# The CSV stores every tag list as its text form (e.g. "['love', 'life']"), so it
# has to be parsed back into a real list before explode() can split it per tag.
tag_lists = df['Tags'].apply(lambda cell: ast.literal_eval(cell) if pd.notnull(cell) else [])
# A quote without tags explodes to NaN; drop those rows so "no tag" is neither
# ranked as a tag nor counted in the denominator of the global percentages.
tags = tag_lists.explode().dropna()
tag_counts = tags.value_counts()  # value_counts() already sorts by descending frequency
top_15_tags = tag_counts.head(15)
# Share of each tag within the top 15 only (matches the wedge sizes of the pie)
percentages = top_15_tags / top_15_tags.sum() * 100

# Draw the pie plot of top 15 tags
fig, ax = plt.subplots(figsize=(10, 6))
wedges, texts, autotexts = ax.pie(top_15_tags, autopct='%1.1f%%', pctdistance=0.85)
legend_labels = [f'{tag}: {percentage:.1f}%' for tag, percentage in zip(top_15_tags.index, percentages)]
ax.legend(wedges, legend_labels,
          title="Tags",
          loc="center left",
          bbox_to_anchor=(1, 0, 0.5, 1))
plt.setp(autotexts, size=8, weight="bold")
ax.set_title("Top 15 Tags")
plt.show()

# Print the top 5 tags and their global percentages (share of all tag occurrences)
top_5_tags = tag_counts.head(5)
percentages = top_5_tags / len(tags) * 100
print("Top Five Tags (Global Percentage):")
for tag, percentage in zip(top_5_tags.index, percentages):
    print(f"{tag} ({percentage:.1f}%)")

Top Five Tags (Global Percentage):
['love'] (4.0%)
['inspirational'] (3.0%)
[] (3.0%)
['attributed-no-source'] (3.0%)
['humor'] (3.0%)


import matplotlib.pyplot as plt
import seaborn as sns

# Extract year of birth
df = pd.read_csv('HuangTianChi Zhu_1931391.csv')
# df0 and df1 are aliases of df (same object, not copies); the names are kept only
# so that code referring to them keeps working. BirthYear is computed once and
# missing years stay NaN instead of being overwritten with a fake year 0.
df0 = df1 = df
# The year is taken from the last four characters of the birthday text; anything
# that is not all digits (including a missing birthday) becomes NaN.
year_text = df['Birthday'].astype(str).str.strip().str[-4:]
df['BirthYear'] = pd.to_numeric(year_text.where(year_text.str.isdigit()), errors='coerce')

# Histogram
plt.figure(figsize=(10, 6))
sns.histplot(df['BirthYear'], kde=False, bins=50)
plt.title('Histogram of birth year')
plt.xlabel('Birth year')
plt.ylabel('Frequency')
plt.show()

# Skew (NaN years are ignored by skew())
skewness = df['BirthYear'].skew()
if skewness > 0.1:
    print("The histogram is right skewed.")
elif skewness < -0.1:
    print("The histogram is left skewed.")
else:
    print("The histogram is roughly symmetric.")

# Boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['BirthYear'])
plt.title('Boxplot of birth year')
plt.xlabel('Birth year')
plt.show()

# IQR
Q1 = df['BirthYear'].quantile(0.25)
Q3 = df['BirthYear'].quantile(0.75)
print(f"The IQR range of birth year is from {int(round(Q1, 0))} to {int(round(Q3, 0))}.")

# Year of birth for the top 5 frequencies
# Only known years are ranked; the percentage is still relative to all rows of
# the table, so rows with an unknown birthday count in the denominator.
known_years = df['BirthYear'].dropna().astype(int)
top_five_years = known_years.value_counts().head(5)
total = len(df)
print("Year of birth for the top 5 frequencies:")
for year, count in top_five_years.items():
    percentage = count / total * 100
    print(f"{year} ({percentage:.1f}%)")

The histogram is left skewed.

The IQR range of birth year is from 1879 to 1939.
Year of birth for the top 5 frequencies:
1879 (10.0%)
1965 (10.0%)
1926 (8.0%)
1904 (7.0%)
1835 (6.0%)


import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import pandas as pd
import seaborn as sns

# Converts the AVG Rating and Reviews columns to numeric types
df = pd.read_csv('HuangTianChi Zhu_1931391.csv')
df['AVG_Rating'] = pd.to_numeric(df['AVG_Rating'], errors='coerce')
# astype(str) first: when no review count contains a thousands separator the
# column is read as numeric and would have no .str accessor. Missing values
# turn into the text 'nan', which errors='coerce' maps back to NaN.
df['Reviews'] = pd.to_numeric(
    df['Reviews'].astype(str).str.replace(',', '', regex=False), errors='coerce')

# Scatterplot and boxplot
fig = plt.figure(figsize=(10, 5))
gs = GridSpec(4, 5, figure=fig)
ax1 = fig.add_subplot(gs[0:3, 0:4])  # main panel: scatter + regression line
ax2 = fig.add_subplot(gs[3, 0:4])    # below: horizontal boxplot of the ratings
ax3 = fig.add_subplot(gs[0:3, 4])    # right: vertical boxplot of the review counts
# regplot draws the points and the fitted line, so no separate scatter() call is
# needed (it would paint every point twice)
sns.regplot(x='AVG_Rating', y='Reviews', data=df, ax=ax1, scatter_kws={'s':20}, line_kws={'color':'red'})
ax1.set_xlabel('AVG_Rating')
ax1.set_ylabel('Reviews')
ax2.boxplot(df['AVG_Rating'].dropna(), vert=False)
ax2.set_yticks([])
ax3.boxplot(df['Reviews'].dropna(), vert=True)
ax3.set_xlabel('Reviews')
plt.tight_layout()
plt.show()

# Correlations (computed over all rows of the table, one row per quote)
correlation = df['AVG_Rating'].corr(df['Reviews'])
print(f'The correlation between AVG_Rating and Reviews is: {correlation}')

# Delete duplicate authors and NA at df
# After the numeric conversion missing values are NaN (never the text 'NA'), so
# they have to be removed with dropna().
df_unique_authors = (
    df.drop_duplicates(subset='Author')
      .dropna(subset=['AVG_Rating', 'Reviews'])
      .copy()
)

# Creates a new column that is the product of the ratios of 'AVG_Rating' and 'Reviews' to their column averages
avg_rating_mean = df_unique_authors['AVG_Rating'].mean()
reviews_mean = df_unique_authors['Reviews'].mean()
df_unique_authors['Adjusted_Rating_Review_Product'] = (
    (df_unique_authors['AVG_Rating'] / avg_rating_mean)
    * (df_unique_authors['Reviews'] / reviews_mean)
)

# Select the top 5 authors and their 'Adjusted_Rating_Review_Product' value
df_sorted = df_unique_authors.sort_values(by='Adjusted_Rating_Review_Product', ascending=False)
top_authors = df_sorted[['Author', 'Adjusted_Rating_Review_Product']].head(5)
print('Top 5 authors with relatively high AVG_Rating and Reviews:')
for author_name, score in zip(top_authors['Author'], top_authors['Adjusted_Rating_Review_Product']):
    print(f"{author_name} ({score:.2f})")

The correlation between AVG_Rating and Reviews is: 0.5434018683345798
Top 5 authors with relatively high AVG_Rating and Reviews:
J.K. Rowling (10.04)
Suzanne Collins (6.55)
Stephenie Meyer (4.39)
Jane Austen (3.22)
Haruki Murakami (3.21)


import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
import pandas as pd
import ast

# Delete duplicate authors and NA at df
df = pd.read_csv('HuangTianChi Zhu_1931391.csv')
# read_csv already turns the text 'NA' into NaN by default, so comparing against
# the string 'NA' filters nothing; missing genres are removed with dropna().
df_unique_authors = (
    df.drop_duplicates(subset='Author')
      .dropna(subset=['Genre'])
      .copy()
)

# Convert the Genre column to a list (the CSV holds the text form of each list)
df_unique_authors['Genre'] = df_unique_authors['Genre'].apply(ast.literal_eval)

# Expand the list in the Genre column and calculate the frequency of each attribute
genre_frequency = Counter(chain.from_iterable(df_unique_authors['Genre']))

# Horizontal barchart
plt.figure(figsize=(15, 10))
plt.barh(list(genre_frequency.keys()), list(genre_frequency.values()))
plt.xlabel('Frequency')
plt.title('Frequency of Each Genre')
plt.show()

# Select top 5 attributes at genre column
print('Top 5 attributes at genre column:')
for genre, freq in genre_frequency.most_common(5):
    print(f'{genre} ({freq})')

# Select your interested genre to filter the author
selected_genres = ['Science', 'Physics', 'Fantasy', 'Science Fiction', 'Fiction']
# Build the lookup set once; an author matches when at least one of their genres
# is exactly one of the selected names
wanted_genres = set(selected_genres)
is_selected = df_unique_authors['Genre'].apply(lambda genres: not wanted_genres.isdisjoint(genres))
selected_authors = df_unique_authors[is_selected]['Author']
print('Here are the authors contain the genre of your interest:')
for author in selected_authors:
    print(author)

Top 5 attributes at genre column:
Fiction (12)
Literature & Fiction (12)
Poetry (11)
Biographies & Memoirs (10)
Nonfiction (6)
Here are the authors contain the genre of your interest:
Albert Einstein
J.K. Rowling
Jane Austen
Thomas A. Edison
Douglas Adams
Elie Wiesel
Garrison Keillor
Jorge Luis Borges
George R.R. Martin
James Baldwin
Haruki Murakami
Ernest Hemingway
Charles Bukowski
Suzanne Collins
J.R.R. Tolkien
J.M. Barrie

	Content	Tags	Author	Description	Birthday	Country	Genre	AVG_Rating	Reviews
0	The world as we have created it is a process o...	[change, deep-thoughts, thinking, world]	Albert Einstein	In 1879, Albert Einstein was born in Ulm, Germ...	March 14, 1879	Ulm, kingdom of Württemberg, German empire	[Science, Philosophy, Physics]	4.07	3,124
1	It is our choices, Harry, that show what we tr...	[abilities, choices]	J.K. Rowling	See also: Robert GalbraithAlthough she writes ...	July 31, 1965	Yate, South Gloucestershire, England, The Unit...	[Fiction, Young Adult, Fantasy]	4.46	768,449
2	There are only two ways to live your life. One...	[inspirational, life, live, miracle, miracles]	Albert Einstein	In 1879, Albert Einstein was born in Ulm, Germ...	March 14, 1879	Ulm, kingdom of Württemberg, German empire	[Science, Philosophy, Physics]	4.07	3,124
3	The person, be it gentleman or lady, who has n...	[aliteracy, books, classic, humor]	Jane Austen	Jane Austen was an English novelist whose work...	December 16, 1775	Steventon Rectory, Hampshire, England, The Uni...	[Fiction, Romance, Humor and Comedy]	4.15	264,582
4	Imperfection is beauty, madness is genius and ...	[be-yourself, inspirational]	Marilyn Monroe	Marilyn Monroe (born Norma Jeane Mortenson; Ju...	June 01, 1926	The United States	[Biographies & Memoirs, Nonfiction, Poetry]	4.12	1,005
...	...	...	...	...	...	...	...	...	...
95	You never really understand a person until you...	[better-life-empathy]	Harper Lee	Harper Lee, known as Nelle, was born in the Al...	April 28, 1926	Monroeville, Alabama, The United States	[Literature & Fiction]	4.22	147,085
96	You have to write the book that wants to be wr...	[books, children, difficult, grown-ups, write,...	Madeleine L'Engle	Madeleine L'Engle was an American writer best ...	November 29, 1918	New York City, New York, The United States	[Literature & Fiction, Science Fiction & Fanta...	4.0	64,264
97	Never tell the truth to people who are not wor...	[truth]	Mark Twain	Samuel Langhorne Clemens, better known by his ...	November 30, 1835	Florida, Missouri, The United States	[Literature & Fiction, Short Stories, Biograph...	3.87	61,142
98	A person's a person, no matter how small.	[inspirational]	Dr. Seuss	Theodor Seuss Geisel was born 2 March 1904 in ...	March 02, 1904	Springfield, MA, The United States	[Children's Books]	4.26	67,268
99	... a mind needs books as a sword needs a whet...	[books, mind]	George R.R. Martin	George R. R. Martin was born September 20, 194...	September 20, 1948	Bayonne, New Jersey, The United States	[Fantasy, Science Fiction, Horror]	4.37	230,558

Task 1: Data scraping

Task 2: Data analysis

Discussion